1 Introduction

The goals of this analysis are to:

  1. To identify community state types (CSTs) based on the composition and relative abundance of bacterial species
  2. To assess whether the treatment caused individual participant transitions between CSTs, particularly towards Lactobacillus dominated CSTs
  3. Compare genital cytokine concentrations with prevalent CSTs

To answer these questions, 56 women with a laboratory-diagnosed sexually transmitted infection (STI)and/or BV-intermediate or positive at baseline, attended all subsequent visits (weeks 6 and 12), and had genital specimen stored to conduct the longitudinal microbiota and cytokine analysis were included. 16S rRNA gene sequencing and multiplex bead arrays were used to characterize the composition of vaginal microbial communities and soluble biomarkers of genital inflammation in cervicovaginal swabs and SoftCup fluids. Community state types were inferred using VALENCIA, a nearest centroid classification method for vaginal microbial communities based on composition. Linear regression analysis was used to compare genital cytokine concentrations with prevalent CSTs to test the hypothesis that vaginal microbiota dominated with non-Lactobacillus species would be associated with increased cytokine concentrations compared to communities dominated by Lactobacillus speciesPrincipal component analysis (PCA) was used to obtain summary measures for the multi-variable cytokine set and performed using the FactoMineR package (https://cran.r project.org/web/packages/FactoMineR/index.html) in R (www.r-project.org).

1.1 Part A: Characterisation of female genital tract microbiome

What is the distribution of bacteria in these women? is there any form of clustering?

1.2 Key results

Three major CSTs identified by VALENCIA

  • CST I dominated by Lactobacillus crispatus,
  • CST III dominated by Lactobacillus iners,
  • CST IV characterized by a wide array of strict and facultative anaerobes without a consistent dominant species.
  • CST IV was subdived into CST IV-A representing samples with high relative abundance of BVAB1,
  • CST IV-B characterized by Gardnerella vaginalis dominance
  • CST IV-C characterized by an even community with moderate abundance of Prevotella bivia.
## CSTs custom color color pallete
CSTs_all <- c("I", "II", "III",  "IV-A", "IV-B", "IV-C", "V")
CSTsPallete <- c("#7CAE00", "#ff0033", "#C77CFF",  "#ff33cc", "#6699CC", "pink", "blue") # "purple", "orange"
CSTsPalleteNamed <- c("#7CAE00", "#ff0033", "#C77CFF",  "#ff33cc", "#6699CC", "#ff0066", "#ff9900")
names(CSTsPalleteNamed) <- CSTs_all
CSTsPalleteValues <- c("I"="#7CAE00", "II"="#ff0033", "III"="#C77CFF",  "IV-A"="#ff33cc", "IV-B"="#6699CC", "IV-C"="#ff0066", "V"="#ff9900")

visitsPallet <- c("lightskyblue4", "red", "lightskyblue")
proInfC <- c("TNF.b"  , "IL.12p40" , "IL.12p70" , "IL.1a" , "IL.6" , "TNF.a" , "IL.1b", "IL.18" , "MIF" , "TRAIL")

allCytokineColnames <- c("IL.1a","IL.1b","IL.6","IL.12p40","IL.12p70","IL.18", "MIF","TNF.a","TNF.b","TRAIL","IL.8","IL.16","CTACK","Eotaxin","IP.10","IFN.a2","GROa","MCP.1","MCP.3","MIG","MIP.1a", "MIP.1b","RANTES","IL.3","IL.7",  "IL.9","b.NGF","FGF.basic","G.CSF","GM.CSF",  "HGF", "LIF","M.CSF", "PDGF.bb", "SCF","SCGF.b", "SDF.1a", "VEGF","IL.2Ra", "IL.2",  "IL.4", "IL.5","IL.13","IL.15","IL.17","IFN.g","IL.1ra", "IL.10")
## 

2 Read in and preprocess data

Read in data, prepare for analysis and perform sanity checks

#Read in data

#tax table
tax_tab <- data.frame(readRDS("dada2-Chimera-Taxonomy/tax_table_final.RDS"))
tax_tab$Species <- as.character(tax_tab$Species)

## Use precise taxonomy
tax_tab[tax_tab$Species %in% "Lactobacillus_crispatus_Lactobacillus_helveticus", "Species"] <- "Lactobacillus_crispatus"
tax_tab[tax_tab$Species %in% "Lactobacillus_acidophilus", "Species"] <- "Lactobacillus_crispatus"
tax_tab[tax_tab$Species %in% "Lactobacillus_gasseri_Lactobacillus_johnsonii", "Species"] <- "Lactobacillus_gasseri"
tax_tab[tax_tab$Species %in% "Prevotella_timonensis", "Species"] <- "Prevotella_bivia"

tax_tab$Species <- dplyr::case_when(substr(tax_tab$Species,1,2) %in% "d_" ~ tax_tab$Species,
                                    substr(tax_tab$Species,1,2) %in% "p_" ~ tax_tab$Species,
                                    substr(tax_tab$Species,1,2) %in% "c_" ~ tax_tab$Species,
                                    substr(tax_tab$Species,1,2) %in% "f_" ~ tax_tab$Species,
                                    substr(tax_tab$Species,1,2) %in% "o_" ~ tax_tab$Species,
                                    substr(tax_tab$Species,1,2) %in% "g_" ~ tax_tab$Species,
                                    TRUE  ~ gsub("_", " ", tax_tab$Species))

tax_tab <- tax_table(as.matrix(tax_tab))

## Import sample data
mapping.data <- readRDS("processed/sample_data.RDS")
rownames(mapping.data) <- mapping.data$SampleID

## Import pecan csts
pecan_csts <- readRDS("metadata/metadata_cst.RDS")
rownames(pecan_csts) <- pecan_csts$SampleID

## Add pecan csts to sample data
mapping.data_ <- inner_join(mapping.data, pecan_csts[, c(1,15:18)], by = "SampleID")
dim(mapping.data_)
## [1] 392  79
rownames(mapping.data_) <- mapping.data_$SampleID

## Import Otu table
otu_tab <- readRDS("dada2-Chimera-Taxonomy/seqtab_final.RDS")
class(otu_tab) <- "numeric"

## Phylogenetic tree
phy <- readRDS("dada2-Phangorn/phangorn.tree.RDS")

## Create phyloseq object
ps00 <- phyloseq(tax_table(tax_tab), otu_table(otu_tab, taxa_are_rows = F), phy_tree(phy$tree))
ps00
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 11018 taxa and 399 samples ]
## tax_table()   Taxonomy Table:    [ 11018 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 11018 tips and 11016 internal nodes ]
## Now prune extra samples not used from sample data
ps0 <- prune_samples(mapping.data$SampleID, ps00)
ps0 <- prune_taxa(taxa_sums(ps0) > 0, ps0)
ps0
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 10723 taxa and 392 samples ]
## tax_table()   Taxonomy Table:    [ 10723 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 10723 tips and 10721 internal nodes ]
## Add metadata to phyloseq object
sample_data(ps0) <- mapping.data_
ps0
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 10723 taxa and 392 samples ]
## sample_data() Sample Data:       [ 392 samples by 79 sample variables ]
## tax_table()   Taxonomy Table:    [ 10723 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 10723 tips and 10721 internal nodes ]
## Perform a few sanity checks. Results should match what is expected.
# sample_variables(ps0) # Display variables from the mapping file
cat("Total number of taxa in the entire dataset\n", ntaxa(ps0)) 
## Total number of taxa in the entire dataset
##  10723
cat("Total number of samples \n", nsamples(ps0))
## Total number of samples 
##  392
# rank_names(ps0) # Taxonomic ranks to confirm proper naming otherwise, correct
# colnames(tax_table(ps0)) <- c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species", "Strain")

2.1 Filter out taxa without Phylum

Taxa without assignment at the Phylumn level are likely spurious and will not be relevant in the downstream analyses so dropping those. In this case we filter out the ’NA’s and ""s.

#Phylum in our data and sample counts at these phylum. Notice <NA>, samples not assigned to any phylum. We remove these
get_taxa_unique(ps0, "Phylum")
##  [1] "Firmicutes"        "Actinobacteriota"  "Bacteroidota"     
##  [4] "Fusobacteriota"    NA                  "Proteobacteria"   
##  [7] "Patescibacteria"   "Parabasalia"       "Campilobacterota" 
## [10] "Verrucomicrobiota" "Spirochaetota"     "Cyanobacteria"    
## [13] "Euglenozoa"        "Retaria"           "Acidobacteriota"  
## [16] "Vertebrata"        "Chloroflexi"       "Deinococcota"
#Determine taxa without Phylum assignment
cat("Taxa distribution by Phylum")
## Taxa distribution by Phylum
table(tax_table(ps0)[, "Phylum", exclude = NULL])
## 
##   Acidobacteriota  Actinobacteriota      Bacteroidota  Campilobacterota 
##                 6              2394              1469                25 
##       Chloroflexi     Cyanobacteria      Deinococcota        Euglenozoa 
##                 1                 5                 1                 1 
##        Firmicutes    Fusobacteriota       Parabasalia   Patescibacteria 
##              5302               631                 5                52 
##    Proteobacteria           Retaria     Spirochaetota Verrucomicrobiota 
##               336                 1                 4                 7 
##        Vertebrata 
##                 4
ps0.Phylum <- subset_taxa(ps0, !is.na(Phylum) & !Phylum %in% c("", "uncharacterized"))
table(tax_table(ps0.Phylum)[, "Phylum"], exclude = NULL)
## 
##   Acidobacteriota  Actinobacteriota      Bacteroidota  Campilobacterota 
##                 6              2394              1469                25 
##       Chloroflexi     Cyanobacteria      Deinococcota        Euglenozoa 
##                 1                 5                 1                 1 
##        Firmicutes    Fusobacteriota       Parabasalia   Patescibacteria 
##              5302               631                 5                52 
##    Proteobacteria           Retaria     Spirochaetota Verrucomicrobiota 
##               336                 1                 4                 7 
##        Vertebrata 
##                 4
# Remove taxa that is not present in any sample (keep only those assigned at least one sample)

# summary(taxa_sums(ps0.Phylum))
ps0.Phylum <- prune_taxa(taxa_sums(ps0.Phylum) > 0, ps0.Phylum)

cat("\nTaxa Summary after filtering out 0 read count taxa\n")
## 
## Taxa Summary after filtering out 0 read count taxa
summary(taxa_sums(ps0.Phylum))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1      18     158    3132    2000  301221
# How many Phylum would be present after filtering?
cat("\nNumber of unique pylum remaining after filtering:\n", length(get_taxa_unique(ps0.Phylum, taxonomic.rank = "Phylum")))
## 
## Number of unique pylum remaining after filtering:
##  17

2.1.1 Filter out non-bacterial taxa

Sample preparation and handling can introduce contamination (non-bacterial DNA). We need to check for and remove all this DNA from the remaining samples.

# Some examples of taxa you may not want to include in your analysis
get_taxa_unique(ps0.Phylum, "Kingdom")
## [1] "Bacteria"  "Eukaryota"
get_taxa_unique(ps0.Phylum, "Phylum")
##  [1] "Firmicutes"        "Actinobacteriota"  "Bacteroidota"     
##  [4] "Fusobacteriota"    "Proteobacteria"    "Patescibacteria"  
##  [7] "Parabasalia"       "Campilobacterota"  "Verrucomicrobiota"
## [10] "Spirochaetota"     "Cyanobacteria"     "Euglenozoa"       
## [13] "Retaria"           "Acidobacteriota"   "Vertebrata"       
## [16] "Chloroflexi"       "Deinococcota"
ps0.Phylum # Check the number of taxa prior to removal
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 10244 taxa and 392 samples ]
## sample_data() Sample Data:       [ 392 samples by 79 sample variables ]
## tax_table()   Taxonomy Table:    [ 10244 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 10244 tips and 10242 internal nodes ]
ps0.phylum.bact <- ps0.Phylum %>%
  subset_taxa(
    Kingdom == "Bacteria" &
    Family  != "mitochondria" &
    Class   != "Chloroplast" & 
    Phylum != "Cyanobacteria"
  )
ps0.phylum.bact # Confirm that the taxa were removed
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 9548 taxa and 392 samples ]
## sample_data() Sample Data:       [ 392 samples by 79 sample variables ]
## tax_table()   Taxonomy Table:    [ 9548 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 9548 tips and 9546 internal nodes ]
get_taxa_unique(ps0.phylum.bact, "Kingdom")
## [1] "Bacteria"
get_taxa_unique(ps0.phylum.bact, "Class")
##  [1] "Bacilli"             "Actinobacteria"      "Coriobacteriia"     
##  [4] "Bacteroidia"         "Clostridia"          "Fusobacteriia"      
##  [7] "Negativicutes"       "Gammaproteobacteria" "Alphaproteobacteria"
## [10] "Campylobacteria"     "Chlamydiae"          "Spirochaetia"       
## [13] "Acidobacteriae"      "Thermoleophilia"     "Verrucomicrobiae"   
## [16] "Chloroflexia"        "Deinococci"          "Saccharimonadia"

2.1.2 Prevalance assessment*

Plot Phyla present along with information about their prevalence (i.e. fraction of samples they are present in) and total abundance accross samples. Dashed horizontal line to show the 5% margin at which we intend to filter.

## Prevalence estimation
# Calculate feature prevalence across the data set. i.e all the samples in which an ASV is found
df.ps0.prevalence <- apply(X = otu_table(ps0.phylum.bact), MARGIN = ifelse(taxa_are_rows(ps0.phylum.bact), yes = 1, no = 2), FUN = function(x){sum(x > 0)})
 
# Add taxonomy and total read counts (all sequences from a sample) to df.prevalence
df.ps0.prevalence <- data.frame(Prevalence = df.ps0.prevalence, TotalAbundance = taxa_sums(ps0.phylum.bact), tax_table(ps0.phylum.bact))
message("Prevalence range (min max): ", min(df.ps0.prevalence$Prevalence), max(df.ps0.prevalence$Prevalence), "\n")
message("Prevalence Summary Stats: ", summary(df.ps0.prevalence$Prevalence), "\n")

#Prevalence plot
df.ps0.prevalence.phylum <- subset(df.ps0.prevalence, Phylum %in% get_taxa_unique(ps0.phylum.bact, "Phylum"))
gg.ps0.prevalence <- ggplot(df.ps0.prevalence.phylum, aes(TotalAbundance, Prevalence / nsamples(ps0.phylum.bact), color=Family)) +
  geom_hline(yintercept = 0.01, alpha = 0.8, linetype = 2) +
  geom_point(size = 3, alpha = 0.8) +
  scale_x_log10() +
  xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
  facet_wrap(~Phylum) +
  theme(legend.position="none") +
  ggtitle("Phylum Prevalence in All Samples\nColored by Family")
gg.ps0.prevalence

#ggplotly(gg.ps1.prevalence)

plyr::ddply(df.ps0.prevalence, "Phylum", function(df1){
  data.frame(mean_prevalence=mean(df1$Prevalence),total_abundance=sum(df1$TotalAbundance,na.rm = T),stringsAsFactors = F)
  })
##               Phylum mean_prevalence total_abundance
## 1    Acidobacteriota        1.166667             238
## 2   Actinobacteriota        4.116330         7839519
## 3       Bacteroidota        2.011461         1939577
## 4   Campilobacterota        1.000000            2749
## 5        Chloroflexi        1.000000               4
## 6       Deinococcota        1.000000               3
## 7         Firmicutes        2.835210        20550942
## 8     Fusobacteriota        2.513471         1454584
## 9    Patescibacteria        1.000000               2
## 10    Proteobacteria        1.428571          143326
## 11     Spirochaetota        1.000000             674
## 12 Verrucomicrobiota        1.285714            1279

2.1.3 Prevalence Filtering,

The data is already filtered at 5% prevalence. This step is therefore only activated as need arises for filtering.

prevalenceThreshold = 0.01 * nsamples(ps0.phylum.bact)
cat("Prevalence threshold:\n\n")
## Prevalence threshold:
prevalenceThreshold
## [1] 3.92
# Define which taxa fall within the prevalence threshold
keepTaxa <- rownames(df.ps0.prevalence.phylum)[(df.ps0.prevalence.phylum$Prevalence >= prevalenceThreshold)]
cat("\n\nTax meeting prevalence threshold:\n\n")
## 
## 
## Tax meeting prevalence threshold:
length(keepTaxa)
## [1] 1351
cat("\n\nTax meeting prevalence threshold:\n\n")
## 
## 
## Tax meeting prevalence threshold:
ntaxa(ps0.phylum.bact)
## [1] 9548
# Remove those taxa
ps1 <- prune_taxa(keepTaxa, ps0.phylum.bact)
ntaxa(ps1)
## [1] 1351
# Calculate feature prevalence across the data set. i.e all the samples in which an ASV is found
df.ps1.prevalence <- apply(X = otu_table(ps1), MARGIN = ifelse(taxa_are_rows(ps1), yes = 1, no = 2), FUN = function(x){sum(x > 0)})

# Add taxonomy and total read counts (all sequences from a sample) to df.prevalence
df.ps1.prevalence <- data.frame(Prevalence = df.ps1.prevalence, TotalAbundance = taxa_sums(ps1), tax_table(ps1))
message("Prevalence range (min max): ", min(df.ps1.prevalence$Prevalence), max(df.ps1.prevalence$Prevalence), "\n")
message("Prevalence Summary Stats: ", summary(df.ps1.prevalence$Prevalence), "\n")

#Prevalence plot
df.ps1.prevalence.phylum <- subset(df.ps1.prevalence, Phylum %in% get_taxa_unique(ps1, "Phylum"))
gg.ps1.prevalence <- ggplot(df.ps1.prevalence.phylum, aes(TotalAbundance, Prevalence / nsamples(ps1), color=Family)) +
  geom_hline(yintercept = 0.01, alpha = 0.8, linetype = 2) +
  geom_point(size = 3, alpha = 0.8) +
  scale_x_log10() +
  xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
  facet_wrap(~Phylum) +
  theme(legend.position="none") +
  ggtitle("Phylum Prevalence in All Samples\nColored by Family")
gg.ps1.prevalence

#ggplotly(gg.ps1.prevalence)

plyr::ddply(df.ps1.prevalence, "Phylum", function(df1){
  data.frame(mean_prevalence=mean(df1$Prevalence),total_abundance=sum(df1$TotalAbundance,na.rm = T),stringsAsFactors = F)
  })
##             Phylum mean_prevalence total_abundance
## 1 Actinobacteriota       17.360577         5639209
## 2     Bacteroidota        9.680556          463716
## 3       Firmicutes       13.007194        14976669
## 4   Fusobacteriota       11.756098          738780
## 5   Proteobacteria        8.857143           28778

2.2 Create or reorder factors

Create factor variables for downstream analyses

cat("Reorder BV score levels such that negative comes first.\n")
## Reorder BV score levels such that negative comes first.
levels(sample_data(ps1)$bvscat)
## [1] "Negative"     "Intermediate" "BV"
sample_data(ps1)$bvscat <- factor(sample_data(ps0.phylum.bact)$bvscat, levels = c("Negative" , "Intermediate", "BV"))
levels(sample_data(ps1)$bvscat)
## [1] "Negative"     "Intermediate" "BV"
#Create/convert STD information to factor
sample_data(ps1)$STI <- as.factor(sample_data(ps1)$STI)
sample_data(ps1)$Inflammation <- as.factor(sample_data(ps1)$Inflammation)
sample_data(ps1)$Chlamydia <- as.factor(sample_data(ps1)$Chlamydia)
sample_data(ps1)$Gonorrhoea <- as.factor(sample_data(ps1)$Gonorrhoea)
sample_data(ps1)$Trichomoniasis <- as.factor(sample_data(ps1)$Trichomoniasis)
sample_data(ps1)$Candidiasis <- as.factor(sample_data(ps1)$Candidiasis)
sample_data(ps1)$PSA <- as.factor(sample_data(ps1)$PSA)
sample_data(ps1)$`HSV.1` <- as.factor(sample_data(ps1)$`HSV.1`)
sample_data(ps1)$`HSV.2` <- as.factor(sample_data(ps1)$`HSV.2`)
sample_data(ps1)$sim_CST <- factor(sample_data(ps1)$sim_CST,  levels = c("I", "II", "III",  "IV-A", "IV-B", "IV-C", "V"))
sample_data(ps1)$sim_subCST <- as.factor(sample_data(ps1)$sim_subCST)

2.3 Subset Samples with visits at all 3 time points

The CAPRISA 083 study screened 267 HIV-uninfected women, and enrolled 101 women, of whom 56 had genital specimen stored accross 3 time points that were used to conduct the longitudinal microbiota and cytokine analysis.

## Create dataset of participants followed accross all three visits

#Extract sample data
sample.data.ps1 <- data.frame(sample_data(ps1))

#Get visit 2 and 3 ids
visit1.ids <- sample.data.ps1 %>% subset(VisitCode == 1000) %>% pull('ParticipantID')
visit1.ids <- visit1.ids[!visit1.ids %in% c(120117, 120016, 120060, 120221, 120229, 120223, 120010)]
visit2.ids <- sample.data.ps1 %>% subset(VisitCode == 1020) %>% pull('ParticipantID') 
visit3.ids <- sample.data.ps1 %>% subset(VisitCode == 1030) %>% pull('ParticipantID')

#IDs of followed samples
ids.v2.not.in.v1 <-  subset(visit2.ids, !visit2.ids %in% visit1.ids)
ids.v2.not.in.v3 <-  subset(visit2.ids, !visit2.ids %in% visit3.ids)
ids.v3.not.in.v1 <-  subset(visit3.ids, !visit3.ids %in% visit1.ids)
ids.v3.not.in.v2 <-  subset(visit3.ids, !visit3.ids %in% visit2.ids)


#All missing visit 1 from visit 2 and 3
ids.missing.visit1  <-  c(ids.v2.not.in.v1, ids.v3.not.in.v1)

all.missing <- unique(c(ids.missing.visit1, ids.v3.not.in.v2, ids.v2.not.in.v3))

ids.followed.v2v3 <- unique(c(visit2.ids, visit3.ids))

#visit 1, 2 and 3 intersections
v1v2.ids <- unique(subset(sample.data.ps1, ParticipantID %in% visit1.ids & ParticipantID %in% visit2.ids) %>% 
                      pull('ParticipantID'))
v1v3.ids <- unique(subset(sample.data.ps1, ParticipantID %in% visit1.ids & ParticipantID %in% visit3.ids) %>% 
                      pull('ParticipantID'))
v2v3.ids <- unique(subset(sample.data.ps1, ParticipantID %in% visit2.ids & ParticipantID %in% visit3.ids) %>% 
                      pull('ParticipantID'))

##Get dataset with followed samples
ps2.v1v2v3 <- subset_samples(ps1, (sample_data(ps1)$ParticipantID %in% ids.followed.v2v3 & 
                               !sample_data(ps1)$ParticipantID %in% all.missing))
ps2.v1v2v3
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1351 taxa and 168 samples ]
## sample_data() Sample Data:       [ 168 samples by 79 sample variables ]
## tax_table()   Taxonomy Table:    [ 1351 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 1351 tips and 1349 internal nodes ]
#Samples followed through the 3 times
ps2.v1v2v3.ids <- unique(sample_data(ps2.v1v2v3)$ParticipantID)
saveRDS(ps2.v1v2v3.ids, "results/v1v2v3-ids.txt")

sample_data(ps2.v1v2v3)$VisitCode <- factor(sample_data(ps2.v1v2v3)$VisitCode, levels = c(1000, 1020, 1030))

#IDs present in visit 1, 2 and 3
ids.followed.v1v2v3 <- unique(sample_data(ps2.v1v2v3)$ParticipantID)
ids.followed.v1v2v3 <- c(ids.followed.v1v2v3) #Plus non bv ids

#Draw venn diagram of visit distribution 
draw.triple.venn(area1 = length(visit1.ids), area2 = length(visit2.ids), area3 = length(visit3.ids), n12 = length(v1v2.ids), n23 = length(v2v3.ids), n13 =length(v1v3.ids), n123 = length(ids.followed.v1v2v3), category = c("Visit1", "Visit2", "Visit3"), lty = "blank", fill = c("skyblue", "pink1", "mediumorchid"), cex=2, cat.cex = 2, cat.fontfamily = rep("serif", 3))

## (polygon[GRID.polygon.910], polygon[GRID.polygon.911], polygon[GRID.polygon.912], polygon[GRID.polygon.913], polygon[GRID.polygon.914], polygon[GRID.polygon.915], text[GRID.text.916], text[GRID.text.917], text[GRID.text.918], text[GRID.text.919], text[GRID.text.920], text[GRID.text.921], text[GRID.text.922], text[GRID.text.923], text[GRID.text.924], text[GRID.text.925])
sample_data(ps2.v1v2v3)$bv.persisted <- 0
sample_data(ps2.v1v2v3)$bv.recurred  <- 0
sample_data(ps2.v1v2v3)$bv.cleared   <- 0

ps2.v1v2v3
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1351 taxa and 168 samples ]
## sample_data() Sample Data:       [ 168 samples by 82 sample variables ]
## tax_table()   Taxonomy Table:    [ 1351 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 1351 tips and 1349 internal nodes ]

2.4 Summary statistics

## Check metadata structure for conformity
#str(sample_data(ps2.v1v2v3))

## Check variables names
## cat("Sample data variable names:\n\n")
## colnames(sample_data(ps2.v1v2v3))

## What is the distribution of bv accross visits?
cat("\n\nDistribution of bv accross visits:\n\n")
## 
## 
## Distribution of bv accross visits:
table(sample_data(ps2.v1v2v3)$bvscat, sample_data(ps2.v1v2v3)$VisitCode)
##               
##                1000 1020 1030
##   Negative        0   17   13
##   Intermediate   22   19   26
##   BV             34   20   17
## What is the distribution of csts accross visits
cat("\n\nDistribution of CSTs accross visits:\n\n")
## 
## 
## Distribution of CSTs accross visits:
table(sample_data(ps2.v1v2v3)$sim_CST, sample_data(ps2.v1v2v3)$VisitCode)
##       
##        1000 1020 1030
##   I       1    2    3
##   III    15   22   26
##   IV-A   17   11    7
##   IV-B   23   20   20
##   IV-C    0    1    0
## What is the distribution of STIs accross visits
cat("\n\nDistribution of STIs accross visits:\n\n")
## 
## 
## Distribution of STIs accross visits:
table(sample_data(ps2.v1v2v3)$STI, sample_data(ps2.v1v2v3)$VisitCode)
##    
##     1000 1020 1030
##   0   25   44   42
##   1   31   12   14
## What is the distribution of inflammation accross visits
cat("\n\nDistribution of inflammation accross visits:\n\n")
## 
## 
## Distribution of inflammation accross visits:
table(sample_data(ps2.v1v2v3)$Inflammation, sample_data(ps2.v1v2v3)$VisitCode)
##    
##     1000 1020 1030
##   0   44   46   47
##   1   12   10    9
### Load visit 1 analysis specific packages
# Set up required packages
.cran_packages <- c("ggpubr", "mclust", "dunn.test", "cluster")
.bioc_packages <- c()

.inst <- .cran_packages %in% installed.packages()
if(any(!.inst)) {
  install.packages(.cran_packages[!.inst])
}
.inst <- .bioc_packages %in% installed.packages()
if(any(!.inst)) {
  source("http://bioconductor.org/biocLite.R")
  biocLite(.bioc_packages[!.inst], ask = F)
}
# Load packages into session, and print package version
sapply(c(.cran_packages, .bioc_packages), require, character.only = TRUE)
##    ggpubr    mclust dunn.test   cluster 
##      TRUE      TRUE      TRUE      TRUE

2.4.1 Sample sequencing depth

## Sequencing depth summary
dt.ps2.v1v2v3.sample_data = cbind(as(sample_data(ps2.v1v2v3), "data.frame"),
                          TotalReads = sample_sums(ps2.v1v2v3)) ##Total reads is sequence depth
gg.SeqDepth = ggplot(dt.ps2.v1v2v3.sample_data, aes(TotalReads)) + geom_histogram() + ggtitle("Sequencing Depth")
ggplotly(gg.SeqDepth)

The graph shows how many reads each sample has. The Y-axis shows the number samples and the X-axis shows the total number of reads for those samples

2.4.2 Sample distribution accross visits

How are the participants distributed accross visits?

## Distribution of BV accross visits
cat("BV status")
## BV status
table(sample_data(ps2.v1v2v3)$bvscat, sample_data(ps2.v1v2v3)$VisitCode)
##               
##                1000 1020 1030
##   Negative        0   17   13
##   Intermediate   22   19   26
##   BV             34   20   17
cat("\nInflammation")
## 
## Inflammation
table(sample_data(ps2.v1v2v3)$Inflammation, sample_data(ps2.v1v2v3)$VisitCode)
##    
##     1000 1020 1030
##   0   44   46   47
##   1   12   10    9
cat("\nInflammation vs BV")
## 
## Inflammation vs BV
table(sample_data(ps2.v1v2v3)$Inflammation, sample_data(ps2.v1v2v3)$bvscat)
##    
##     Negative Intermediate BV
##   0       26           55 56
##   1        4           12 15
cat("\nSTI")
## 
## STI
table(sample_data(ps2.v1v2v3)$STI, sample_data(ps2.v1v2v3)$VisitCode)
##    
##     1000 1020 1030
##   0   25   44   42
##   1   31   12   14

2.5 Estimate richness and transform to relative abundance

## Estimate richness
ps2.v1v2v3.rich <- cestimate_richness(ps2.v1v2v3)

## Aglomerate to species
ps2.v1v2v3.glom.species <- tax_glom(ps2.v1v2v3.rich, "Species")

## Transform to relative abundance
ps2.v1v2v3.glom.species.ra <- transform_sample_counts(ps2.v1v2v3.glom.species, function(x) {
  x/sum(x)
})
otu_table(ps2.v1v2v3.glom.species.ra)[is.nan(otu_table(ps2.v1v2v3.glom.species.ra))] <- 0

## Distribution of samples accross CSTS
levels(sample_data(ps2.v1v2v3.glom.species.ra)$sim_CST)
## [1] "I"    "III"  "IV-A" "IV-B" "IV-C"
table(sample_data(ps2.v1v2v3.glom.species.ra)$sim_CST)
## 
##    I  III IV-A IV-B IV-C 
##    6   63   35   63    1
table(sample_data(ps2.v1v2v3.glom.species.ra)$sim_CST, sample_data(ps2.v1v2v3.glom.species.ra)$VisitCode)
##       
##        1000 1020 1030
##   I       1    2    3
##   III    15   22   26
##   IV-A   17   11    7
##   IV-B   23   20   20
##   IV-C    0    1    0
table(sample_data(ps2.v1v2v3.glom.species.ra)$sim_CST, sample_data(ps2.v1v2v3.glom.species.ra)$bvscat)
##       
##        Negative Intermediate BV
##   I           4            2  0
##   III        24           26 13
##   IV-A        0            9 26
##   IV-B        2           30 31
##   IV-C        0            0  1
#Sort by CTS snd Shannon indexes before proceeding
sample_data(ps2.v1v2v3.glom.species.ra) <- sample_data(ps2.v1v2v3.glom.species.ra)[order(sample_data(ps2.v1v2v3.glom.species.ra)$sim_CST, sample_data(ps2.v1v2v3.glom.species.ra)$ShannonLgT), ]

2.6 Alpha and beta diversity ordinations

##               
##                1000 1020 1030
##   Negative        0   17   13
##   Intermediate   22   19   26
##   BV             34   20   17
## numeric(0)

2.6.1 Change in Shannon Diversity accross visits (Overall significance)

sd.ps2.v1v2v3.glom.species.ra <- data.frame(sample_data(ps2.v1v2v3.glom.species.ra))
ggpaired(sd.ps2.v1v2v3.glom.species.ra, x = "VisitCode", y = "ShannonLgT",
         color = "VisitCode", line.color = "gray", line.size = 0.4,
         palette = visitsPallet, id = "ParticipantID", width = 0.1 ) +
  scale_x_discrete(breaks=c(1000, 1020, 1030), labels=c("Baseline", "Visit 2", "Visit 3")) +
  scale_color_discrete(name = "Clinical Visits", labels = c("Baseline", "Visit 2", "Visit 3")) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme_light() +
  ylab("Shannon Diversity") +
  xlab("Clinical Visits") +
  stat_compare_means(paired = TRUE)

2.6.2 Change in Shannon diversity levels accross visits (compare visits)

## Change in Shannon diversity accross visits
my_comparisons <- list( c("1000", "1020"), c("1020", "1030"))
ggpaired(sd.ps2.v1v2v3.glom.species.ra, x = "VisitCode", y = "ShannonLgT",
         color = "VisitCode", line.color = "gray", line.size = 0.4,
         palette = visitsPallet, id = "ParticipantID", width = 0.1 ) +
  scale_x_discrete(breaks=c(1000, 1020, 1030), labels=c("Baseline", "Visit 2", "Visit 3")) +
  scale_color_discrete(name = "Clinical Visits", labels = c("Baseline", "Visit 2", "Visit 3")) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme_light() +
  ylab("Shannon Diversity") +
  xlab("Clinical Visits") +
  stat_compare_means(paired = TRUE, comparisons = my_comparisons)

PS: Legends for the annotations will be imposed in the final graphics using a different program. However, for BV, No BV=white, intermediate=grey and bv positive=maroon. For the rest of the annotations, color=positive.

## 
##  Shapiro-Wilk normality test
## 
## data:  sd.ps2.v1v2v3.glom.species.ra$ShannonLgT
## W = 0.86853, p-value = 0.00000000005878

## 
##  Kruskal-Wallis rank sum test
## 
## data:  sd.ps2.v1v2v3.glom.species.ra$ShannonLgT and as.factor(sd.ps2.v1v2v3.glom.species.ra$bvscat)
## Kruskal-Wallis chi-squared = 9.3617, df = 2, p-value = 0.009271
##   Kruskal-Wallis rank sum test
## 
## data: x and group
## Kruskal-Wallis chi-squared = 9.3617, df = 2, p-value = 0.01
## 
## 
##                            Comparison of x by group                            
##                                 (No adjustment)                                
## Col Mean-|
## Row Mean |         BV   Intermed
## ---------+----------------------
## Intermed |   1.002800
##          |     0.1580
##          |
## Negative |   3.054965   2.250734
##          |    0.0011*    0.0122*
## 
## alpha = 0.05
## Reject Ho if p <= alpha/2

## 
##  Kruskal-Wallis rank sum test
## 
## data:  sd.ps2.v1v2v3.glom.species.ra$ShannonLgT and sd.ps2.v1v2v3.glom.species.ra$sim_CST
## Kruskal-Wallis chi-squared = 29.005, df = 4, p-value = 0.000007798
##   Kruskal-Wallis rank sum test
## 
## data: x and group
## Kruskal-Wallis chi-squared = 29.0052, df = 4, p-value = 0
## 
## 
##                            Comparison of x by group                            
##                                 (No adjustment)                                
## Col Mean-|
## Row Mean |          I        III       IV-A       IV-B
## ---------+--------------------------------------------
##      III |  -0.701540
##          |     0.2415
##          |
##     IV-A |  -2.434721  -3.681219
##          |    0.0075*    0.0001*
##          |
##     IV-B |  -2.543799  -4.417582  -0.052319
##          |    0.0055*    0.0000*     0.4791
##          |
##     IV-C |   0.618589   0.960292   1.719560   1.741218
##          |     0.2681     0.1685     0.0428     0.0408
## 
## alpha = 0.05
## Reject Ho if p <= alpha/2

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  ShannonLgT by STI
## W = 3157, p-value = 0.984
## alternative hypothesis: true location shift is not equal to 0
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  ShannonLgT by Inflammation
## W = 2119, p-value = 0.987
## alternative hypothesis: true location shift is not equal to 0
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  ShannonLgT by PSA
## W = 2309, p-value = 0.6626
## alternative hypothesis: true location shift is not equal to 0
## 
## Call:  glm(formula = ShannonLgT ~ bvscat * Inflammation, data = sd.ps2.v1v2v3.glom.species.ra)
## 
## Coefficients:
##                      (Intercept)                bvscatIntermediate  
##                          0.55241                           0.02717  
##                         bvscatBV                     Inflammation1  
##                          0.05246                          -0.05062  
## bvscatIntermediate:Inflammation1            bvscatBV:Inflammation1  
##                          0.05141                           0.01827  
## 
## Degrees of Freedom: 167 Total (i.e. Null);  162 Residual
## Null Deviance:       2.581 
## Residual Deviance: 2.502     AIC: -216

2.7 Betadiversity

2.7.1 Batch effects - sequencing run

Do we see any effect of sequencing run that can confound our analysis?

evals <- ord.ps2.v1v2v3.glom.species.ra.pcoa.bray$values$Eigenvalues
p.pcoa.bray <- plot_ordination(ps2.v1v2v3.glom.species.ra, ord.ps2.v1v2v3.glom.species.ra.pcoa.bray, color = "SeqRun", axes = c(1,2)) +
  geom_point(size = 2) +
  scale_fill_manual(values = CSTsPallete) +
  labs(title = "PCoA of Bray Curtis (Abundance)", color = "SeqRuns") +
  theme_minimal() +
  coord_fixed(sqrt(evals[2] / evals[1])) #+
  #stat_ellipse(type = "t")
p.pcoa.bray

2.7.2 Do we see any clustering CSTs

## Cross tabulation of CTs and BV Score
##       
##        Negative Intermediate BV
##   I           4            2  0
##   III        24           26 13
##   IV-A        0            9 26
##   IV-B        2           30 31
##   IV-C        0            0  1
## Cross tabulation of CTs and Inflamation
##       
##         0  1
##   I     5  1
##   III  51 12
##   IV-A 31  4
##   IV-B 49 14
##   IV-C  1  0

2.8 Differential analyses

In this section we focus only on participants with all 3 visits. We assess the microbiome dynamics accross visits to establish treatment effects on the microbiome profiles of young women in the study.

Ordinations

2.9 Individual Visits Ordinations

2.9.1 Visit 1 without BV

What is the normal characterization of bacterial flora in women within this community that do not have BV?

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 62 taxa and 51 samples ]
## sample_data() Sample Data:       [ 51 samples by 83 sample variables ]
## tax_table()   Taxonomy Table:    [ 62 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 62 tips and 61 internal nodes ]

2.9.2 Visit 1 only, the 56 with BV

## [1] "I"    "III"  "IV-A" "IV-B"
## NULL

Visit 1 biplots

2.9.3 Visit 2 only

Visit three heatmaps and bar plots

Visit 3 PCA biplots

2.9.4 Combined distribution of most abundant taxa

2.10 BV Treated - Cleared - Recurred (At visit level)

Here we consider the relative abundance of bacteria among the participants who were bv intermediate or positive, had cleared in visit2 and tested positive or intermediate in visit 3

### BV treated - cleared - recurred (at individual level)

2.10.1 Transition of participants accross CSTs

2.10.2 Stacked barplot of taxa present in samples split by CST

2.10.3 Alluvial plot showing transition of samples accross CSTs

ps2.v1.alluv <- subset_samples(ps2.v1v2v3.glom.species.ra, VisitCode == 1000)  
ps2.v1.alluv_sd <- data.frame(sample_data(ps2.v1.alluv)[order(sample_data(ps2.v1.alluv)$SampleID, sample_data(ps2.v1.alluv)$VisitCode),])

ps2.v2.alluv <- subset_samples(ps2.v1v2v3.glom.species.ra, VisitCode == 1020)
ps2.v2.alluv_sd <- data.frame(sample_data(ps2.v2.alluv)[order(sample_data(ps2.v2.alluv)$SampleID, sample_data(ps2.v2.alluv)$VisitCode),])

ps2.v3.alluv <- subset_samples(ps2.v1v2v3.glom.species.ra, VisitCode == 1030)
ps2.v3.alluv_sd <-  data.frame(sample_data(ps2.v3.alluv)[order(sample_data(ps2.v3.alluv)$SampleID, sample_data(ps2.v3.alluv)$VisitCode),])

alluvialData1 <- inner_join(ps2.v1.alluv_sd[, c("ParticipantID", "sim_CST")], ps2.v2.alluv_sd[, c("ParticipantID", "sim_CST")], by = "ParticipantID")
alluvialData1 <- inner_join(alluvialData1, ps2.v3.alluv_sd[, c("ParticipantID", "sim_CST")], by = "ParticipantID")
alluvialData1 <- alluvialData1[, (2:4)]
colnames(alluvialData1) <- c("Visit1", "Visit2", "Visit3")
alluvialData1 <- alluvialData1 %>% dplyr::group_by(Visit1, Visit2, Visit3) %>% dplyr::summarise(n = n()) 
alluvial(
    alluvialData1[, 1:3],
    freq=alluvialData1$n,
    col = ifelse( alluvialData1$Visit1 == "I", CSTsPallete[1], 
                  ifelse( alluvialData1$Visit1 == "II", CSTsPallete[2], 
                          ifelse( alluvialData1$Visit1 == "III", CSTsPallete[3], 
                                  ifelse( alluvialData1$Visit1 == "IV-A", CSTsPallete[4],  
                                          ifelse( alluvialData1$Visit1 == "IV-B", CSTsPallete[5], 
                                                  ifelse( alluvialData1$Visit1 == "IV-C", CSTsPallete[6], 
                                                          ifelse( alluvialData1$Visit1 == "V", CSTsPallete[7], ""))))))),
    border = c("white"),
    alpha = 0.8,
    blocks=FALSE,
    axis_labels = c("Baseline", "6 weeks", "3 months")
  )

2.10.4 Stats / Proportions

##        I III IV.A IV.B IV.C
## Visit1 1  15   17   23    0
## Visit2 2  22   11   20    1
## Visit3 3  26    7   20    0
##          Visit1     Visit2     Visit3
## I    0.01785714 0.03571429 0.05357143
## III  0.26785714 0.39285714 0.46428571
## IV.A 0.30357143 0.19642857 0.12500000
## IV.B 0.41071429 0.35714286 0.35714286
## IV.C 0.00000000 0.01785714 0.00000000

2.10.5 PCA biplot of V1 V2 taxa and cytokine fold change

library(corrplot)
profInfC <- c("TNF.b"  , "IL.12p40" , "IL.12p70" , "IL.1a" , "IL.6" , "TNF.a" , "IL.1b", "IL.18" , "MIF" , "TRAIL")


#View(data.frame(sample_data(ps2.v1v2v3.cst.glom.Species.ra)))
ps2.v1.cyto <- prune_samples(sample_data(ps2.v1v2v3.glom.species.ra)$VisitCode == 1000, ps2.v1v2v3.glom.species.ra)
ps2.v1.cyto <- prune_taxa(taxa_sums(ps2.v1.cyto) > 0, ps2.v1.cyto)

ps2.v2.cyto <- prune_samples(sample_data(ps2.v1v2v3.glom.species.ra)$VisitCode == 1020, ps2.v1v2v3.glom.species.ra)
ps2.v2.cyto <- prune_taxa(taxa_sums(ps2.v2.cyto) > 0, ps2.v2.cyto)

visit1cyto <- subset(data.frame(sample_data(ps2.v1v2v3.glom.species.ra)), VisitCode == 1000, select = c("TNF.b"  , "IL.12p40" , "IL.12p70" , "IL.1a" , "IL.6" , "TNF.a" , "IL.1b", "IL.18" , "MIF" , "TRAIL"))
visit2cyto <- subset(data.frame(sample_data(ps2.v1v2v3.glom.species.ra)), VisitCode == 1020, select = c("TNF.b"  , "IL.12p40" , "IL.12p70" , "IL.1a" , "IL.6" , "TNF.a" , "IL.1b", "IL.18" , "MIF" , "TRAIL"))

visit1cyto$PID <- as.numeric(substr(rownames(visit1cyto), 1, nchar(rownames(visit1cyto))-1))
visit2cyto$PID <- as.numeric(substr(rownames(visit2cyto), 1, nchar(rownames(visit2cyto))-1))


idsInBoth <- intersect(visit1cyto$PID, visit2cyto$PID)

visit1cyto <- visit1cyto[visit1cyto$PID %in% idsInBoth, ]
visit2cyto <- visit2cyto[visit2cyto$PID %in% idsInBoth, ]

## order by rownames of v2c
visit1cyto <- visit1cyto[with(visit2cyto, order(rownames(visit1cyto))), ]
visit12FC <- (visit2cyto - visit1cyto[with(visit2cyto, order(rownames(visit1cyto))), ])
visit12FC$PID <- NULL ##drop PID column


##V1 samples
V1_otu_tab <- data.frame(otu_table(ps2.v1.cyto))
V1_otu_tab1 <- as.data.frame(t(V1_otu_tab))
V1_otu_tab1$OTU <- rownames(V1_otu_tab1)

V1_tax_tab <- data.frame(tax_table(ps2.v1.cyto))
V1_tax_tab$OTU <- rownames(V1_tax_tab)

otuWSpeciesV1 <- merge(V1_otu_tab1, V1_tax_tab[, c("OTU", "Species")], by = "OTU")
otuWSpeciesV1 <- otuWSpeciesV1[, -1]
otuWSpeciesV1 <- otuWSpeciesV1[!duplicated(otuWSpeciesV1$Species),]
rownames(otuWSpeciesV1) <- otuWSpeciesV1$Species
t.otuWSpeciesV1 <- data.frame(t(otuWSpeciesV1[, -ncol(otuWSpeciesV1)]))


##V2 samples
V2_otu_tab <- data.frame(otu_table(ps2.v2.cyto))
V2_otu_tab1 <- as.data.frame(t(V2_otu_tab))
V2_otu_tab1$OTU <- rownames(V2_otu_tab1)

V2_tax_tab <- data.frame(tax_table(ps2.v2.cyto))
V2_tax_tab$OTU <- rownames(V2_tax_tab)

otuWSpeciesV2 <- merge(V2_otu_tab1, V1_tax_tab[, c("OTU", "Species")], by = "OTU")
otuWSpeciesV2 <- otuWSpeciesV2[, -1]
otuWSpeciesV2 <- otuWSpeciesV2[!duplicated(otuWSpeciesV2$Species),]
rownames(otuWSpeciesV2) <- otuWSpeciesV2$Species
t.otuWSpeciesV2 <- data.frame(t(otuWSpeciesV2[, -ncol(otuWSpeciesV2)]))

## resolve ids
t.otuWSpeciesV1$PID <- as.numeric(substr(rownames(t.otuWSpeciesV1), 1, nchar(rownames(t.otuWSpeciesV1))-1))
t.otuWSpeciesV2$PID <- as.numeric(substr(rownames(t.otuWSpeciesV2), 1, nchar(rownames(t.otuWSpeciesV2))-1))

## Taxa in both
taxaInBoth <- intersect(colnames(t.otuWSpeciesV1), colnames(t.otuWSpeciesV2))

t.otuWSpeciesV1 <- t.otuWSpeciesV1[t.otuWSpeciesV1$PID %in% idsInBoth, taxaInBoth]
t.otuWSpeciesV2 <- t.otuWSpeciesV2[t.otuWSpeciesV2$PID %in% idsInBoth, taxaInBoth]


##Fold Change
otuWSpeciesFC <- (t.otuWSpeciesV2 - t.otuWSpeciesV1[with(t.otuWSpeciesV2, order(rownames(t.otuWSpeciesV1))), ])
otuWSpeciesFC$PID <- NULL ##drop PID column

cyto.spec <- cbind(otuWSpeciesFC, visit12FC)
cyto.spec.pca <- PCA(scale(as.matrix(cyto.spec), center = TRUE, scale = TRUE), scale.unit = FALSE, graph = FALSE)


corrplot::corrplot(cor(otuWSpeciesFC , visit12FC), type="full",
         p.mat = cyto.spec.pca$P, sig.level = 0.001, insig = "blank")

#par(cex = cex.before)

v2sd <- data.frame(sample_data(ps2.v2.cyto))
ps2.v2.cyto.ha <- subset_samples(ps2.v2.cyto, get_variable(ps2.v2.cyto, "ParticipantID") %in% idsInBoth)
fviz_pca_biplot(cyto.spec.pca,
             #individuals
             geom.ind = "point", # show points only (but not "text"),
             habillage = (data.frame(sample_data(ps2.v2.cyto.ha)))$sim_CST,
             col.var = "black",

             #variables
             alpha.var =  "cos2",
             palette = CSTsPallete,
             legend.title = "sim_CST",
             title = "PCA plot Showing the Relationship Between Bacteria and Proinflammatory Cytokines",
             repel = TRUE)+
  scale_color_brewer(palette="Dark2")+
  theme_minimal(base_size = 14) +
  theme(text = element_text(face = "bold"))

## Visit 2 (treatment)
ps2.v2.cst.glom.species.ra <- subset_samples(ps2.v1v2v3.glom.species.ra, get_variable(ps2.v1v2v3.glom.species.ra, "VisitCode") == 1020)
ps2.v2.cst.glom.species.ra <- prune_taxa(taxa_sums(ps2.v2.cst.glom.species.ra) >0, ps2.v2.cst.glom.species.ra)
ps2.v2.cst.glom.species.ra
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 25 taxa and 56 samples ]
## sample_data() Sample Data:       [ 56 samples by 87 sample variables ]
## tax_table()   Taxonomy Table:    [ 25 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 25 tips and 24 internal nodes ]
keyCytokines <- c("G.CSF", "IL.1a", "IL.1b", "M.CSF")

ps2.v2.cyto <- subset_taxa(ps2.v2.cst.glom.species.ra, Species %in% keyTaxa)
taxa_names(ps2.v2.cyto) <- tax_table(ps2.v2.cyto)[, "Species"]

otu.ps2.v2.cyto <- data.frame(otu_table(ps2.v2.cyto))
df.ps2.v2.cyto <- data.frame(sample_data(ps2.v2.cyto)[, c(keyCytokines)])
dt.ps2.v2.cyto.otu <- cbind(otu.ps2.v2.cyto, df.ps2.v2.cyto)

dt.ps2.v2.pca <- PCA(scale(as.matrix(dt.ps2.v2.cyto.otu), center = TRUE, scale = TRUE), scale.unit = FALSE, graph = FALSE)

fviz_pca_biplot(dt.ps2.v2.pca,
             #individuals
             geom.ind = "point", # show points only (but not "text"),
             habillage = (data.frame(sample_data(ps2.v2.cyto)))$sim_CST,
             col.var = "black",

             #variables
             alpha.var =  "cos2",
             palette = CSTsPallete,
             legend.title = "sim_CST",
             title = "PCA plot Showing the Relationship Between Bacteria and Proinflammatory Cytokines",
             repel = TRUE)+
  scale_color_brewer(palette="Dark2")+
  theme_minimal(base_size = 14) +
  theme(text = element_text(face = "bold"))

### Correlation analyses:
library("Hmisc")

flattenCorrMatrix <- function(cormat, pmat) {
  ut <- upper.tri(cormat)
  data.frame(
    row = rownames(cormat)[row(cormat)[ut]],
    column = rownames(cormat)[col(cormat)[ut]],
    cor  =(cormat)[ut],
    p = pmat[ut]
    )
}

res2<-rcorr(as.matrix(dt.ps2.v2.cyto.otu))

## flattenCorrMatrix(res2$r, res2$P)

2.10.6 Additional PCA Plots

Only Cleared

## 
##  0  1 
## 43 13
## 
##  0  1 
## 29 27
## 
##  0  1 
## 40 16

Only persisted

Only recurred

2.11 DESeq2

## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 1276 taxa and 168 samples ]
## sample_data() Sample Data:       [ 168 samples by 82 sample variables ]
## tax_table()   Taxonomy Table:    [ 1276 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 1276 tips and 1274 internal nodes ]

## [1] "Intercept"              "VisitCode_1020_vs_1000" "VisitCode_1030_vs_1000"
## 
## out of 63 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 21, 33%
## LFC < 0 (down)     : 27, 43%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
## [1] 63
## [1] 45
##  [1] "Row.names"      "baseMean"       "log2FoldChange" "lfcSE"         
##  [5] "stat"           "pvalue"         "padj"           "Kingdom"       
##  [9] "Phylum"         "Class"          "Order"          "Family"        
## [13] "Genus"          "Species"

## 
## out of 63 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 14, 22%
## LFC < 0 (down)     : 34, 54%
## outliers [1]       : 0, 0%
## low counts [2]     : 0, 0%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
## [1] 63
## [1] 45
##  [1] "Row.names"      "baseMean"       "log2FoldChange" "lfcSE"         
##  [5] "stat"           "pvalue"         "padj"           "Kingdom"       
##  [9] "Phylum"         "Class"          "Order"          "Family"        
## [13] "Genus"          "Species"

2.12 Cytokine heatmaps All Visits

Cytokine Heatmaps - All visits

significant_cytokines <- c("PC1", "IL.1a" ,"IL.1b", "IL.6","IL.12p40","IL.12p70","IL.18","MIF", "TNF.a", "TNF.b","TRAIL", "MIP.1b", "CTACK", "GROa", "IP.10", "MIG", "RANTES", "LIF", "SCF" , "SCGF.b" ,  "IL.5")

df.all.cyto.melis.scale.sub <- df.all.cyto.melis.scale[, significant_cytokines]

df.all.cyto.melis.scale.sub.t <- t(df.all.cyto.melis.scale.sub)

#[order(rownames(df.all.cyto.melis.scale.t.sub)[significant_cytokines]),]

## Plot cytokine heatmaps of all Cytokines
plotHeatmaps(df.all.cyto.melis.scale.sub.t, df.all.sampledata.melis,  myAnnotColmns)

###Print them all out
myAnnotColmnsb <- c( "sim_subCST")

plotHeatmaps_b <- function(dataFr, dataFr2, annotColmnsb){
   require(gplots)
   for(i in c(1:length(annotColmnsb))){
    colName <- annotColmnsb[i]
    #Define color vector
    colVec <- as.vector(dataFr2[, colName])
    colVec <- replace(colVec, which(colVec == "I-A"),  CSTsPallete[1])
    colVec <- replace(colVec, which(colVec == "I-B"),  CSTsPallete[2])
    colVec <- replace(colVec, which(colVec == "III-A"),CSTsPallete[5])
    colVec <- replace(colVec, which(colVec == "III-B"),CSTsPallete[6])
    colVec <- replace(colVec, which(colVec == "IV-A"), CSTsPallete[7])
    colVec <- replace(colVec, which(colVec == "IV-B"), CSTsPallete[8])
    colVec <- replace(colVec, which(colVec == "IV-C0"),CSTsPallete[9])
    
    #Generate heatmaps
    heatmap.2(dataFr,
          main = paste0("Heatmap with ", colName, " annotation"),
          hclustfun = hclust2,
          distfun=dist2,
          trace="none",         # turns off trace lines inside the heat map
          col=my_palette,       # use on color palette defined earlier
          breaks=col_breaks,    # enable color transition at specified limits
          sepcolor="gray70",    # determines the separation color
          colsep=0:ncol(dataFr), # determines where column separators go
          rowsep=0:nrow(dataFr), # determines where row separators go
          sepwidth=c(0.01,0.01), # determines the width of the separators
          density.info="none", # turns off density plot inside color legend
          keysize = 1.5,
          labCol = FALSE, #remove column names
          ColSideColors=colVec,
          key = T
    )  
   }
}

## Plot cytokine heatmaps of all Cytokines
plotHeatmaps_b(df.all.cyto.melis.scale.sub.t, df.all.sampledata.melis,  myAnnotColmnsb)

###Print them all out
myAnnotColmnsc <- c("bvscat")

plotHeatmaps_c <- function(dataFr, dataFr2, annotColmnsc){
   require(gplots)
   for(i in c(1:length(annotColmnsc))){
    colName <- annotColmnsc[i]
    #Define color vector
    colVec <- as.vector(dataFr2[, colName])
    colVec <- replace(colVec, which(colVec == "Negative"),"white")
    colVec <- replace(colVec, which(colVec == "Intermediate"),"grey60")
    colVec <- replace(colVec, which(colVec == "BV"),"maroon")
    
    #Generate heatmaps
    heatmap.2(dataFr,
          main = paste0("Heatmap with ", colName, " annotation"),
          hclustfun = hclust2,
          distfun=dist2,
          trace="none",         # turns off trace lines inside the heat map
          col=my_palette,       # use on color palette defined earlier
          breaks=col_breaks,    # enable color transition at specified limits
          sepcolor="gray70",    # determines the separation color
          colsep=0:ncol(dataFr), # determines where column separators go
          rowsep=0:nrow(dataFr), # determines where row separators go
          sepwidth=c(0.01,0.01), # determines the width of the separators
          density.info="none", # turns off density plot inside color legend
          keysize = 1.5,
          labCol = FALSE, #remove column names
          ColSideColors=colVec,
          key = T
    )  
   }
}

## Plot cytokine heatmaps of all Cytokines
plotHeatmaps_c(df.all.cyto.melis.scale.sub.t, df.all.sampledata.melis,  myAnnotColmnsc)

2.13 Network analysis

Here we assess the likelihoods of transitioning between CSTs between visits. i.e. is the introduction of medication likely to cause a transition in CST? if yes, what are the most probable transitions?

2.14 Participant Transitions Across CTS

## Warning: package 'igraph' was built under R version 3.5.2
## Warning: package 'markovchain' was built under R version 3.5.2
##      igraph markovchain   dunn.test 
##        TRUE        TRUE        TRUE

Here we assess the effect of treatment on the CSTs status of participants between visit 1 and visit 2. Does treatment have an effect on the transtion between CSTs? and more importantly will there be significant transtion towards L. Crispatus? which is the desired state for a healthy vagina.

## 
##  Visit 1 - BV/CST Distribution
##               
##                 I III IV-A IV-B
##   Intermediate  1   7    7    7
##   BV            0   8   10   16
## 
##  Visit 2 - BV/CST Distribution
##               
##                 I III IV-A IV-B IV-C
##   Negative      1  14    0    2    0
##   Intermediate  1   6    1   11    0
##   BV            0   2   10    7    1
## 
##  Visit 3 - BV/CST Distribution
##               
##                 I III IV-A IV-B
##   Negative      3  10    0    0
##   Intermediate  0  13    1   12
##   BV            0   3    6    8
## Marked increase L.Crispatus after treatment and reduction in diversity
##       
##         I III IV-A IV-B
##   I     0   1    0    0
##   III   3   6    1    5
##   IV-A  0   6    4    7
##   IV-B  0  13    2    8
## Table showing transitions accross visits
 dt.ps2.v1v2.netM$CurrCST 
 I   III   IV-A   IV-B 
 dt.ps2.v1v2.netM$PrevCST 
   I  1
   III  3 6 1 5
   IV-A  6 4 7
   IV-B  13 2 8
   #Total cases  3 26 7 20
##       [,1]      [,2]       [,3]      [,4]
## I-A    0.0 0.0000000 0.00000000 0.0000000
## I-B    0.0 0.0000000 0.00000000 0.0000000
## III-A  0.0 1.0000000 0.00000000 0.0000000
## III-B  0.2 0.4000000 0.06666667 0.3333333
## IV-A   0.0 0.3529412 0.23529412 0.4117647
## IV-B   0.0 0.5652174 0.08695652 0.3478261
## [1] 0.0000000 0.0000000 0.0000000 0.9102392

2.14.1 Analysis Environment

sessionInfo()
## R version 3.5.0 (2018-04-23)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS  10.15.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
##  [1] parallel  stats4    grid      stats     graphics  grDevices utils    
##  [8] datasets  methods   base     
## 
## other attached packages:
##  [1] markovchain_0.6.9.14        igraph_1.2.4.1             
##  [3] gplots_3.0.1.1              EnhancedVolcano_1.0.1      
##  [5] Hmisc_4.2-0                 Formula_1.2-3              
##  [7] survival_2.44-1.1           corrplot_0.84              
##  [9] alluvial_0.1-2              plyr_1.8.4                 
## [11] ggsn_0.5.0                  factoextra_1.0.5           
## [13] FactoMineR_1.41             ggrepel_0.8.1              
## [15] cluster_2.0.9               dunn.test_1.3.5            
## [17] mclust_5.4.3                ggpubr_0.2                 
## [19] magrittr_1.5                biomformat_1.10.1          
## [21] ape_5.3                     reshape2_1.4.3             
## [23] scales_1.0.0                DESeq2_1.22.2              
## [25] SummarizedExperiment_1.12.0 DelayedArray_0.8.0         
## [27] BiocParallel_1.16.6         matrixStats_0.54.0         
## [29] Biobase_2.42.0              GenomicRanges_1.34.0       
## [31] GenomeInfoDb_1.18.2         IRanges_2.16.0             
## [33] S4Vectors_0.20.1            BiocGenerics_0.28.0        
## [35] phyloseq_1.26.1             dada2_1.10.1               
## [37] Rcpp_1.0.3                  VennDiagram_1.6.20         
## [39] futile.logger_1.4.3         stringi_1.4.3              
## [41] expss_0.8.11                plotly_4.9.0               
## [43] RColorBrewer_1.1-2          data.table_1.12.2          
## [45] vegan_2.5-5                 lattice_0.20-38            
## [47] permute_0.9-5               forcats_0.4.0              
## [49] stringr_1.4.0               dplyr_0.8.3                
## [51] purrr_0.3.3                 readr_1.3.1                
## [53] tidyr_1.0.0                 tibble_2.1.3               
## [55] ggplot2_3.2.1               tidyverse_1.2.1            
## [57] gridExtra_2.3               knitr_1.26                 
## 
## loaded via a namespace (and not attached):
##   [1] tidyselect_0.2.5         RSQLite_2.1.1            AnnotationDbi_1.44.0    
##   [4] htmlwidgets_1.5.1        maptools_0.9-5           munsell_0.5.0           
##   [7] codetools_0.2-16         units_0.6-2              withr_2.1.2             
##  [10] colorspace_1.4-1         rstudioapi_0.10          leaps_3.0               
##  [13] ggsignif_0.5.0           labeling_0.3             RgoogleMaps_1.4.5.3     
##  [16] GenomeInfoDbData_1.2.0   hwriter_1.3.2            bit64_0.9-7             
##  [19] rhdf5_2.26.2             vctrs_0.2.0              generics_0.0.2          
##  [22] lambda.r_1.2.3           xfun_0.11                R6_2.4.1                
##  [25] locfit_1.5-9.1           bitops_1.0-6             assertthat_0.2.1        
##  [28] promises_1.1.0           nnet_7.3-12              gtable_0.3.0            
##  [31] rlang_0.4.1              zeallot_0.1.0            genefilter_1.64.0       
##  [34] scatterplot3d_0.3-41     splines_3.5.0            lazyeval_0.2.2          
##  [37] acepack_1.4.1            broom_0.5.2              checkmate_1.9.3         
##  [40] yaml_2.2.0               modelr_0.1.4             crosstalk_1.0.0         
##  [43] backports_1.1.5          httpuv_1.5.2             tools_3.5.0             
##  [46] base64enc_0.1-3          zlibbioc_1.28.0          classInt_0.3-3          
##  [49] RCurl_1.95-4.12          rpart_4.1-15             ggmap_3.0.0             
##  [52] haven_2.2.0              futile.options_1.0.1     hms_0.5.2               
##  [55] mime_0.7                 evaluate_0.14            xtable_1.8-4            
##  [58] XML_3.98-1.20            jpeg_0.1-8.1             readxl_1.3.1            
##  [61] compiler_3.5.0           KernSmooth_2.23-15       crayon_1.3.4            
##  [64] htmltools_0.4.0          mgcv_1.8-28              later_1.0.0             
##  [67] geneplotter_1.60.0       expm_0.999-4             RcppParallel_4.4.2      
##  [70] lubridate_1.7.4          DBI_1.0.0                formatR_1.6             
##  [73] matlab_1.0.2             MASS_7.3-51.4            sf_0.7-4                
##  [76] ShortRead_1.40.0         Matrix_1.2-17            ade4_1.7-13             
##  [79] cli_1.1.0                gdata_2.18.0             pkgconfig_2.0.3         
##  [82] flashClust_1.01-2        GenomicAlignments_1.18.1 foreign_0.8-71          
##  [85] sp_1.3-2                 xml2_1.2.2               foreach_1.4.4           
##  [88] annotate_1.60.1          multtest_2.38.0          XVector_0.22.0          
##  [91] rvest_0.3.5              digest_0.6.22            Biostrings_2.50.2       
##  [94] rmarkdown_1.17           cellranger_1.1.0         htmlTable_1.13.1        
##  [97] gtools_3.8.1             shiny_1.4.0              Rsamtools_1.34.1        
## [100] rjson_0.2.20             lifecycle_0.1.0          nlme_3.1-140            
## [103] jsonlite_1.6             Rhdf5lib_1.4.3           viridisLite_0.3.0       
## [106] pillar_1.4.2             fastmap_1.0.1            httr_1.4.1              
## [109] glue_1.3.1               png_0.1-7                iterators_1.0.10        
## [112] bit_1.1-14               class_7.3-15             blob_1.1.1              
## [115] caTools_1.17.1.2         latticeExtra_0.6-28      memoise_1.1.0           
## [118] e1071_1.7-2